From 705ea2b54c35ca51c24dee51e4d88ffb8a7dc889 Mon Sep 17 00:00:00 2001
From: "kaf24@firebug.cl.cam.ac.uk" <kaf24@firebug.cl.cam.ac.uk>
Date: Sat, 25 Feb 2006 21:28:27 +0100
Subject: [PATCH] New VCPUOP_register_runstate_memory_area hypercall. Avoids
 need for a hypercall in the guest timer interrupt handler.

Cleaned up stolen/blocked tick handling in Linux.

Signed-off-by: Keir Fraser <keir@xensource.com>
---
 .../arch/i386/kernel/time-xen.c               | 89 ++++++++++---------
 xen/arch/x86/domain.c                         |  5 ++
 xen/common/domain.c                           | 22 +++++
 xen/include/public/vcpu.h                     | 23 ++++-
 xen/include/xen/sched.h                       |  1 +
 5 files changed, 99 insertions(+), 41 deletions(-)

diff --git a/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c b/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c
index 8a82b9ca61..864bf17daf 100644
--- a/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c
@@ -130,6 +130,9 @@ static DEFINE_PER_CPU(u64, processed_system_time);
 static DEFINE_PER_CPU(u64, processed_stolen_time);
 static DEFINE_PER_CPU(u64, processed_blocked_time);
 
+/* Current runstate of each CPU (updated automatically by the hypervisor). */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
 #define NS_PER_TICK (1000000000LL/HZ)
 
@@ -575,19 +578,36 @@ EXPORT_SYMBOL(profile_pc);
 irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
 	s64 delta, delta_cpu, stolen, blocked;
+	u64 sched_time;
 	int i, cpu = smp_processor_id();
 	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
-	struct vcpu_runstate_info runstate;
+	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
 
 	write_seqlock(&xtime_lock);
 
 	do {
 		get_time_values_from_xen();
 
+		/* Obtain a consistent snapshot of elapsed wallclock time. */
 		delta = delta_cpu = 
 			shadow->system_timestamp + get_nsec_offset(shadow);
 		delta     -= processed_system_time;
 		delta_cpu -= per_cpu(processed_system_time, cpu);
+
+		/*
+		 * Snapshot stolen/blocked time (ns). Xen rewrites the area if
+		 * we are preempted, so retry until state_entry_time is stable.
+		 */
+		do {
+			sched_time = runstate->state_entry_time;
+			barrier();
+			stolen = runstate->time[RUNSTATE_runnable] +
+				runstate->time[RUNSTATE_offline] -
+				per_cpu(processed_stolen_time, cpu);
+			blocked = runstate->time[RUNSTATE_blocked] -
+				per_cpu(processed_blocked_time, cpu);
+			barrier();
+		} while (sched_time != runstate->state_entry_time);
 	}
 	while (!time_values_up_to_date(cpu));
 
@@ -619,60 +639,44 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 
 	write_sequnlock(&xtime_lock);
 
-	/* Obtain stolen/blocked cycles, if the hypervisor supports it. */
-	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info,
-			       cpu, &runstate) == 0) {
-		/*
-		 * Account stolen ticks.
-		 * HACK: Passing NULL to account_steal_time()
-		 * ensures that the ticks are accounted as stolen.
-		 */
-		stolen = runstate.time[RUNSTATE_runnable] +
-			runstate.time[RUNSTATE_offline] -
-			per_cpu(processed_stolen_time, cpu);
-		if (unlikely(stolen < 0)) /* clock jitter */
-			stolen = 0;
+	/*
+	 * Account stolen ticks.
+	 * HACK: Passing NULL to account_steal_time()
+	 * ensures that the ticks are accounted as stolen.
+	 */
+	if (stolen > 0) {
 		delta_cpu -= stolen;
-		if (unlikely(delta_cpu < 0)) {
-			stolen += delta_cpu;
-			delta_cpu = 0;
-		}
 		do_div(stolen, NS_PER_TICK);
 		per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
+		per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
 		account_steal_time(NULL, (cputime_t)stolen);
+	}
 
-		/*
-		 * Account blocked ticks.
-		 * HACK: Passing idle_task to account_steal_time()
-		 * ensures that the ticks are accounted as idle/wait.
-		 */
-		blocked = runstate.time[RUNSTATE_blocked] -
-			per_cpu(processed_blocked_time, cpu);
-		if (unlikely(blocked < 0)) /* clock jitter */
-			blocked = 0;
+	/*
+	 * Account blocked ticks.
+	 * HACK: Passing idle_task to account_steal_time()
+	 * ensures that the ticks are accounted as idle/wait.
+	 */
+	if (blocked > 0) {
 		delta_cpu -= blocked;
-		if (unlikely(delta_cpu < 0)) {
-			blocked += delta_cpu;
-			delta_cpu = 0;
-		}
 		do_div(blocked, NS_PER_TICK);
 		per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
+		per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
 		account_steal_time(idle_task(cpu), (cputime_t)blocked);
-
-		per_cpu(processed_system_time, cpu) +=
-			(stolen + blocked) * NS_PER_TICK;
 	}
 
+	/* Account user/system ticks. */
 	if (delta_cpu > 0) {
 		do_div(delta_cpu, NS_PER_TICK);
+		per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
 		if (user_mode(regs))
 			account_user_time(current, (cputime_t)delta_cpu);
 		else
 			account_system_time(current, HARDIRQ_OFFSET,
 					    (cputime_t)delta_cpu);
-		per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
 	}
 
+	/* Local timer processing (see update_process_times()). */
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_mode(regs));
@@ -684,14 +688,19 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 
 static void init_missing_ticks_accounting(int cpu)
 {
-	struct vcpu_runstate_info runstate = { 0 };
+	struct vcpu_register_runstate_memory_area area;
+	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
+
+	memset(runstate, 0, sizeof(*runstate));
 
-	HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate);
+	area.addr.v = runstate;
+	HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
 
-	per_cpu(processed_blocked_time, cpu) = runstate.time[RUNSTATE_blocked];
+	per_cpu(processed_blocked_time, cpu) =
+		runstate->time[RUNSTATE_blocked];
 	per_cpu(processed_stolen_time, cpu) =
-		runstate.time[RUNSTATE_runnable] +
-		runstate.time[RUNSTATE_offline];
+		runstate->time[RUNSTATE_runnable] +
+		runstate->time[RUNSTATE_offline];
 }
 
 /* not static: needed by APM */
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 1cec5b9aa0..4f7da5a96c 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -784,6 +784,11 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
 
     context_saved(prev);
 
+    /* Update per-VCPU guest runstate shared memory area (if registered). */
+    if ( next->runstate_guest != NULL )
+        __copy_to_user(next->runstate_guest, &next->runstate,
+                       sizeof(next->runstate));
+
     schedule_tail(next);
     BUG();
 }
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 598d7e1b69..b80d8398e4 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -461,6 +461,28 @@ long do_vcpu_op(int cmd, int vcpuid, void *arg)
         break;
     }
 
+    case VCPUOP_register_runstate_memory_area:
+    {
+        struct vcpu_register_runstate_memory_area area;
+
+        rc = -EINVAL;
+        if ( v != current )
+            break;
+
+        rc = -EFAULT;
+        if ( copy_from_user(&area, arg, sizeof(area)) )
+            break;
+
+        if ( !access_ok(area.addr.v, sizeof(*area.addr.v)) )
+            break;
+
+        rc = 0;
+        v->runstate_guest = area.addr.v;
+        __copy_to_user(v->runstate_guest, &v->runstate, sizeof(v->runstate));
+
+        break;
+    }
+
     default:
         rc = -ENOSYS;
         break;
diff --git a/xen/include/public/vcpu.h b/xen/include/public/vcpu.h
index 8a425b57da..1c36f81655 100644
--- a/xen/include/public/vcpu.h
+++ b/xen/include/public/vcpu.h
@@ -53,7 +53,7 @@
 
 /*
  * Return information about the state and running time of a VCPU.
- * @extra_arg == pointer to xen_vcpu_info structure.
+ * @extra_arg == pointer to vcpu_runstate_info structure.
  */
 #define VCPUOP_get_runstate_info    4
 typedef struct vcpu_runstate_info {
@@ -85,6 +85,27 @@ typedef struct vcpu_runstate_info {
  */
 #define RUNSTATE_offline  3
 
+/*
+ * Register a shared memory area from which the guest may obtain its own
+ * runstate information without needing to execute a hypercall.
+ * Notes:
+ *  1. The registered address may be virtual or physical, depending on the
+ *     platform. The virtual address should be registered on x86 systems.
+ *  2. Only one shared area may be registered per VCPU. The shared area is
+ *     updated by the hypervisor each time the VCPU is scheduled. Thus
+ *     runstate.state will always be RUNSTATE_running and
+ *     runstate.state_entry_time will indicate the system time at which the
+ *     VCPU was last scheduled to run.
+ * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
+ */
+#define VCPUOP_register_runstate_memory_area 5
+typedef struct vcpu_register_runstate_memory_area {
+    union {
+        struct vcpu_runstate_info *v;
+        uint64_t p;
+    } addr;
+} vcpu_register_runstate_memory_area_t;
+
 #endif /* __XEN_PUBLIC_VCPU_H__ */
 
 /*
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index f6ab18b31b..91f457702a 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -70,6 +70,7 @@ struct vcpu
     void            *sched_priv;    /* scheduler-specific data */
 
     struct vcpu_runstate_info runstate;
+    struct vcpu_runstate_info *runstate_guest; /* guest address */
 
     unsigned long    vcpu_flags;
 
-- 
2.30.2